# Replication Mode Detection and Helpers
# Alexander F. Gazmararian
# agazmararian@gmail.com
#
# This module provides functions for dual-mode replication:
# - "full" mode: All data available, compute Conley SEs fresh
# - "anonymized" mode: Use cached calculations to protect respondent privacy
#
# The mode is auto-detected based on whether coordinate data is available.
# Mode is stored in options() to persist across sourced scripts.

library(here)

# Seed options("replication.mode") from the legacy REPLICATION_MODE global,
# but only when the option has not been set yet. options() is the preferred
# store because it lasts for the whole R session, whereas a global variable
# can be overwritten when another script is sourced.
if (is.null(getOption("replication.mode")) &&
    exists("REPLICATION_MODE") &&
    !is.null(REPLICATION_MODE) &&
    REPLICATION_MODE != "") {
  options(replication.mode = REPLICATION_MODE)
}

# Mirror the option back into REPLICATION_MODE (NULL when the option is unset)
# so code that still reads the global variable sees the same value.
REPLICATION_MODE <- getOption("replication.mode")

#' Detect replication mode based on available data
#'
#' Looks for the geo-referenced survey file under `data/inter/` and decides
#' whether usable coordinates are present. Two sources are tried in order:
#' the active geometry of an `sf` object, then plain `lat_zip` / `lon_zip`
#' columns. A failure while extracting `sf` coordinates (for example because
#' the `sf` namespace cannot be loaded) does not abort detection: the column
#' fallback is still consulted.
#'
#' @return Character string: "full" when at least one non-missing coordinate
#'   is found, otherwise "anonymized". A message naming the chosen mode is
#'   emitted as a side effect.
#' @export
detect_replication_mode <- function() {
  geo_file <- here("data", "inter", "survey_visibility_processed_with_geo.rds")

  # Read the file on its own so that only genuine read failures are reported
  # as such; g stays NULL when the file is missing or unreadable.
  g <- NULL
  if (file.exists(geo_file)) {
    g <- tryCatch(
      readRDS(geo_file),
      error = function(e) {
        message("Could not read geo file: ", conditionMessage(e))
        NULL
      }
    )
  }

  # sf objects: st_coordinates() works on the active geometry column, whatever
  # its name, so the class check alone decides whether to try it. Errors here
  # are contained so the lat/lon column fallback below still runs.
  if (inherits(g, "sf")) {
    coords <- tryCatch(
      sf::st_coordinates(g),
      error = function(e) {
        message("Could not extract sf coordinates: ", conditionMessage(e))
        NULL
      }
    )
    if (!is.null(coords) && NROW(coords) > 0 && !all(is.na(coords))) {
      message("Replication mode: FULL (coordinate data available)")
      return("full")
    }
  }

  # Fallback: plain latitude/longitude columns (stub files may carry the
  # columns but hold only NA values, which does not count as available).
  has_coord_cols <- is.list(g) && all(c("lat_zip", "lon_zip") %in% names(g))
  if (has_coord_cols &&
      !all(is.na(g[["lat_zip"]])) &&
      !all(is.na(g[["lon_zip"]]))) {
    message("Replication mode: FULL (coordinate columns available)")
    return("full")
  }

  message("Replication mode: ANONYMIZED (using cached calculations)")
  "anonymized"
}

#' Initialize replication mode
#'
#' Resolves the replication mode once per session. If
#' `getOption("replication.mode")` is unset or an empty string, the mode is
#' detected with `detect_replication_mode()` and stored via `options()`;
#' otherwise the stored value is reused as-is. Either way the legacy
#' `REPLICATION_MODE` variable is updated to match.
#'
#' @return The current replication mode
#' @export
init_replication_mode <- function() {
  resolved <- getOption("replication.mode", NULL)

  # Unset or blank option: run detection and remember the answer in options()
  needs_detection <- is.null(resolved) || resolved == ""
  if (needs_detection) {
    resolved <- detect_replication_mode()
    options(replication.mode = resolved)
  }

  # Superassignment keeps the legacy variable aligned with the option
  REPLICATION_MODE <<- resolved

  resolved
}

#' Get variance-covariance matrix (cached or computed fresh)
#'
#' Outside anonymized mode this hands back a `fixest::vcov_conley()`
#' specification built from `cutoff`, `lat` and `lon`. In anonymized mode it
#' instead looks up the pre-computed matrix stored under `model_name` in the
#' cache file, stopping with an error if the file or the entry is missing.
#'
#' @param model_name Character string identifying the model (used as cache key)
#' @param lat Column name for latitude (default: "lat_zip")
#' @param lon Column name for longitude (default: "lon_zip")
#' @param cutoff Distance cutoff in km for Conley SEs (default: 50)
#' @param cache_file Path to vcov cache file
#' @return In anonymized mode: a vcov matrix. In full mode: a vcov_conley specification.
#' @export
get_cached_or_fresh_vcov <- function(model_name, 
                                      lat = "lat_zip", 
                                      lon = "lon_zip", 
                                      cutoff = 50,
                                      cache_file = here("data", "cache", "vcov_conley_cache.rds")) {

  # Coordinates available: let fixest compute Conley SEs itself
  if (init_replication_mode() != "anonymized") {
    return(fixest::vcov_conley(cutoff = cutoff, lat = lat, lon = lon))
  }

  # Anonymized mode: everything must come from the cache file
  if (!file.exists(cache_file)) {
    stop("Anonymized mode requires cached vcov matrices. File not found: ", cache_file)
  }

  stored <- readRDS(cache_file)
  known_models <- names(stored)

  if (!(model_name %in% known_models)) {
    stop("Model '", model_name, "' not found in vcov cache. Available models: ", 
         paste(known_models, collapse = ", "))
  }

  stored[[model_name]]
}

#' Save vcov matrix to cache
#'
#' Stores `vcov_matrix` under the key `model_name` in the list kept in
#' `cache_file`, replacing any entry already stored under that key. If the
#' cache file does not exist yet, its directory is created and a new list is
#' started. Intended to be called in full mode after each model is estimated.
#'
#' @param model_name Character string identifying the model
#' @param vcov_matrix The variance-covariance matrix to cache
#' @param cache_file Path to vcov cache file
#' @export
save_vcov_to_cache <- function(model_name, 
                                vcov_matrix,
                                cache_file = here("data", "cache", "vcov_conley_cache.rds")) {

  cache_present <- file.exists(cache_file)

  # First write: make sure the target directory is there
  if (!cache_present) {
    dir.create(dirname(cache_file), recursive = TRUE, showWarnings = FALSE)
  }

  # Start from the stored list when there is one, otherwise from scratch
  stored <- if (cache_present) readRDS(cache_file) else list()
  stored[[model_name]] <- vcov_matrix

  saveRDS(stored, cache_file)
  message("Cached vcov for model: ", model_name)
}

#' Check if running in anonymized mode
#'
#' Resolves the mode through `init_replication_mode()` and compares it with
#' "anonymized".
#'
#' @return TRUE if in anonymized mode, FALSE otherwise
#' @export
is_anonymized_mode <- function() {
  active_mode <- init_replication_mode()
  active_mode == "anonymized"
}

#' Check if running in full mode
#'
#' Resolves the mode through `init_replication_mode()` and compares it with
#' "full".
#'
#' @return TRUE if full mode, FALSE otherwise
#' @export
is_full_mode <- function() {
  active_mode <- init_replication_mode()
  active_mode == "full"
}

#' Get path to cached distance calculations
#'
#' The file lives in data/cache/ rather than data/inter/ because it:
#' - cannot be rebuilt in anonymized mode (coordinates are required)
#' - is meant to survive across pipeline runs
#' - is a "cached computation" rather than an "intermediate output"
#'
#' @return Path to the cached distance file
#' @export
get_distance_cache_path <- function() {
  cache_file_name <- "respondent_distance2project_processed.rds"
  here("data", "cache", cache_file_name)
}

#' Check if distance cache exists
#'
#' Tests for a file at the location given by `get_distance_cache_path()`.
#'
#' @return TRUE if cache exists, FALSE otherwise
#' @export
distance_cache_exists <- function() {
  cache_path <- get_distance_cache_path()
  file.exists(cache_path)
}

#' Get path to cached population density calculations
#'
#' The file lives in data/cache/ rather than data/inter/ because it:
#' - cannot be rebuilt in anonymized mode (coordinates are required)
#' - is meant to survive across pipeline runs
#' - is a "cached computation" rather than an "intermediate output"
#'
#' @return Path to the cached population density file
#' @export
get_popdensity_cache_path <- function() {
  cache_file_name <- "survey_popdensity_processed.csv"
  here("data", "cache", cache_file_name)
}

#' Check if population density cache exists
#'
#' Tests for a file at the location given by `get_popdensity_cache_path()`.
#'
#' @return TRUE if cache exists, FALSE otherwise
#' @export
popdensity_cache_exists <- function() {
  cache_path <- get_popdensity_cache_path()
  file.exists(cache_path)
}

#' Get path to cached FIPS assignments
#'
#' The file lives in data/cache/ rather than data/inter/ because it:
#' - cannot be rebuilt in anonymized mode (the spatial join needs coordinates)
#' - is meant to survive across pipeline runs
#' - holds county-level identifiers, not precise locations
#'
#' Contains: response_id, fips, fips.pre, fips.bea, county, state
#'
#' @return Path to the cached FIPS file
#' @export
get_fips_cache_path <- function() {
  cache_file_name <- "survey_fips_processed.csv"
  here("data", "cache", cache_file_name)
}

#' Check if FIPS cache exists
#'
#' Tests for a file at the location given by `get_fips_cache_path()`.
#'
#' @return TRUE if cache exists, FALSE otherwise
#' @export
fips_cache_exists <- function() {
  cache_path <- get_fips_cache_path()
  file.exists(cache_path)
}

